import os
import pandas as pd
import jieba
import re

# Path configuration.
# Input directory: the raw CSV files that the batch loop reads.
source_folder = 'D:/Code/PycharmProjects/250611_clean/爬虫数据_605'
# Output directory: each cleaned CSV is written here under its original name.
output_folder = 'D:/Code/PycharmProjects/250611_clean/cleaned_data'
# Create the output directory (and any missing parents) before processing;
# exist_ok=True makes re-running the script against an existing folder safe.
os.makedirs(name=output_folder, exist_ok=True)

# Text-cleaning helper.
# Patterns are compiled once because clean_text runs once per row.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Return *text* with punctuation removed and whitespace normalized.

    Every character that is neither a word character (``\\w``, which keeps
    letters, digits, CJK characters and ``_``) nor whitespace is deleted,
    then each run of whitespace is collapsed to a single space and the
    result is stripped of leading/trailing whitespace.

    Args:
        text: The value to clean. Non-string values (e.g. numbers read from
            a CSV column that was inferred as numeric) are converted with
            ``str()``; ``None`` and float NaN are treated as empty text.

    Returns:
        The cleaned string; ``""`` when nothing but punctuation/whitespace
        was present.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Strip punctuation BEFORE collapsing whitespace: deleting a symbol that
    # sits between two spaces ("a , b") would otherwise leave a double space.
    text = _PUNCTUATION_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Batch processing: clean every CSV in source_folder and write the result
# under the same file name into output_folder.
for filename in os.listdir(source_folder):
    # Compare the extension case-insensitively so files named "*.CSV"
    # are not silently ignored.
    if not filename.lower().endswith('.csv'):
        continue

    file_path = os.path.join(source_folder, filename)
    # NOTE(review): read_csv/to_csv use pandas' default encoding here —
    # confirm the source files are UTF-8.
    df = pd.read_csv(file_path)

    if '评论' not in df.columns:
        # Report the skip instead of dropping the file without a trace.
        print("跳过文件（缺少“评论”列）:", filename)
        continue

    # Drop rows with a missing comment, then exact duplicate comments.
    # .copy() gives an independent frame so the column assignments below
    # cannot trigger pandas' SettingWithCopyWarning.
    df_clean = (
        df.dropna(subset=['评论'])
        .drop_duplicates(subset=['评论'])
        .copy()
    )

    # Coerce to str first: a column pandas inferred as numeric would
    # otherwise make the regex/jieba calls raise TypeError.
    df_clean['评论'] = df_clean['评论'].astype(str).apply(clean_text)

    # jieba.cut also yields whitespace as tokens; filter them out so the
    # space-joined result has exactly one space between segments.
    df_clean['分词'] = df_clean['评论'].apply(
        lambda x: " ".join(tok for tok in jieba.cut(x) if tok.strip())
    )

    df_clean.to_csv(os.path.join(output_folder, filename), index=False)

print("批量数据清洗完成，文件已保存至 cleaned_data 文件夹。")
